Introduction

Preparation

Load Libraries

library(tidyverse)
library(kableExtra)
library(viridis)
library(ggExtra)
library(gghighlight)
library(ggridges)
library(gganimate)
library(gifski)
library(png)

Read Data

df <- readr::read_csv("merged.csv")
## Parsed with column specification:
## cols(
##   player = col_character(),
##   year = col_double(),
##   age = col_double(),
##   ranking = col_double(),
##   points = col_double(),
##   tourn_played = col_double(),
##   matches = col_double(),
##   surface = col_character(),
##   percentage_service = col_double(),
##   percentage_return = col_double(),
##   games_won_service = col_double(),
##   games_total_service = col_double(),
##   games_won_return = col_double(),
##   games_total_return = col_double()
## )

Set Options

#ggplotの文字化け対策
theme_set(theme_bw(base_family = "HiraKakuProN-W3"))

Data Check

glimpse()

  • player: 選手名
  • year: 年
  • age: 年齢
  • ranking: 世界ランキング
  • points: ランキングポイント
  • tourn_played: 出場した大会数
  • matches: 試合数
  • surface: コートの種類(Allは全コート分)
  • percentage_service: サービスゲームの獲得率(keep率)
  • percentage_return: リターンゲームの獲得率(break率)
  • games_won_service: 獲得したサービスゲーム数
  • games_total_service: 全サービスゲーム数
  • games_won_return: 獲得したリターンゲーム数
  • games_total_return: 全リターンゲーム数
df %>% dplyr::glimpse()
## Observations: 8,678
## Variables: 14
## $ player              <chr> "Pete Sampras", "Michael Stich", "Guy Forge…
## $ year                <dbl> 1991, 1991, 1991, 1991, 1991, 1991, 1991, 1…
## $ age                 <dbl> 20, 23, 26, 25, 21, 31, 24, 20, 20, 27, 22,…
## $ ranking             <dbl> 6, 4, 7, 1, 2, 5, 3, 45, 16, 20, 50, 69, 26…
## $ points              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tourn_played        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ matches             <dbl> 71, 99, 84, 93, 78, 73, 62, 46, 64, 77, 55,…
## $ surface             <chr> "All", "All", "All", "All", "All", "All", "…
## $ percentage_service  <dbl> 87.39, 87.26, 86.15, 85.36, 84.53, 84.48, 8…
## $ percentage_return   <dbl> 26.00, 24.10, 24.56, 33.33, 29.52, 29.44, 2…
## $ games_won_service   <dbl> 721, 925, 784, 892, 754, 773, 668, 496, 559…
## $ games_total_service <dbl> 825, 1060, 910, 1045, 892, 915, 792, 590, 6…
## $ games_won_return    <dbl> 209, 255, 223, 348, 258, 272, 215, 122, 146…
## $ games_total_return  <dbl> 804, 1058, 908, 1044, 874, 924, 797, 587, 6…

summary()

df %>% summary()
##     player               year           age           ranking    
##  Length:8678        Min.   :1991   Min.   :17.00   Min.   :  1   
##  Class :character   1st Qu.:1997   1st Qu.:23.00   1st Qu.: 17   
##  Mode  :character   Median :2004   Median :26.00   Median : 36   
##                     Mean   :2004   Mean   :25.93   Mean   : 40   
##                     3rd Qu.:2011   3rd Qu.:28.00   3rd Qu.: 59   
##                     Max.   :2018   Max.   :40.00   Max.   :100   
##                                    NA's   :1790    NA's   :1790  
##      points       tourn_played     matches        surface         
##  Min.   :    0   Min.   : 0.0   Min.   :  2.0   Length:8678       
##  1st Qu.:  565   1st Qu.:20.0   1st Qu.:  5.0   Class :character  
##  Median :  900   Median :24.0   Median : 16.0   Mode  :character  
##  Mean   : 1278   Mean   :20.7   Mean   : 23.1                     
##  3rd Qu.: 1475   3rd Qu.:27.0   3rd Qu.: 35.0                     
##  Max.   :16585   Max.   :38.0   Max.   :105.0                     
##  NA's   :1790    NA's   :1790                                     
##  percentage_service percentage_return games_won_service
##  Min.   :  0.00     Min.   : 0.00     Min.   :   0.0   
##  1st Qu.: 73.67     1st Qu.:17.78     1st Qu.:  58.0   
##  Median : 78.74     Median :22.86     Median : 141.0   
##  Mean   : 77.76     Mean   :22.25     Mean   : 217.9   
##  3rd Qu.: 83.49     3rd Qu.:27.27     3rd Qu.: 329.0   
##  Max.   :100.00     Max.   :51.03     Max.   :1108.0   
##                                                        
##  games_total_service games_won_return games_total_return
##  Min.   :   0.0      Min.   :  0.00   Min.   :   0.00   
##  1st Qu.:  73.0      1st Qu.: 13.00   1st Qu.:  72.25   
##  Median : 182.0      Median : 46.00   Median : 180.00   
##  Mean   : 274.5      Mean   : 67.45   Mean   : 273.64   
##  3rd Qu.: 413.0      3rd Qu.:104.00   3rd Qu.: 412.00   
##  Max.   :1229.0      Max.   :431.00   Max.   :1224.00   
## 

sample data

  • year == 2018 & surface == “All”のデータ
df %>%
  dplyr::filter(year == 2018 & surface == "All") %>% 
  dplyr::arrange(ranking) %>% 
  kableExtra::kable() %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)
player year age ranking points tourn_played matches surface percentage_service percentage_return games_won_service games_total_service games_won_return games_total_return
Novak Djokovic 2018 31 1 9045 17 65 All 87.23 30.26 724 830 246 813
Rafael Nadal 2018 32 2 7480 13 47 All 86.44 36.55 529 612 216 591
Roger Federer 2018 37 3 6420 17 58 All 91.10 23.89 686 753 178 745
Alexander Zverev 2018 21 4 6385 21 73 All 82.95 27.60 754 909 252 913
Juan Martin del Potro 2018 30 5 5300 18 60 All 87.61 25.20 672 767 192 762
Kevin Anderson 2018 32 6 4710 20 66 All 89.06 15.95 847 951 149 934
Marin Cilic 2018 30 7 4250 19 58 All 86.99 22.79 709 815 183 803
Dominic Thiem 2018 25 8 4095 25 70 All 85.18 24.44 753 884 217 888
Kei Nishikori 2018 29 9 3590 24 64 All 81.38 24.46 638 784 191 781
John Isner 2018 33 10 3155 23 54 All 93.60 9.42 805 860 80 849
Karen Khachanov 2018 22 11 2835 25 66 All 86.40 23.00 724 838 193 839
Borna Coric 2018 22 12 2480 20 54 All 83.54 26.14 528 632 166 635
Fabio Fognini 2018 31 13 2315 25 64 All 77.26 29.43 615 796 236 802
Kyle Edmund 2018 23 14 2150 23 57 All 84.40 21.95 622 737 162 738
Stefanos Tsitsipas 2018 20 15 2095 30 73 All 85.20 19.69 783 919 177 899
Daniil Medvedev 2018 22 16 1977 27 64 All 81.02 23.14 619 764 177 765
Diego Schwartzman 2018 26 17 1880 26 55 All 74.10 30.67 492 664 207 675
Milos Raonic 2018 28 18 1855 20 45 All 90.80 15.54 533 587 90 579
Marco Cecchinato 2018 26 20 1819 30 46 All 78.20 18.92 470 601 112 592
Nikoloz Basilashvili 2018 26 21 1795 29 55 All 75.92 20.72 514 677 139 671
Pablo Carreno Busta 2018 27 23 1705 22 52 All 78.23 26.17 503 643 168 642
Roberto Bautista Agut 2018 30 24 1605 24 51 All 80.10 26.04 499 623 157 603
Hyeon Chung 2018 22 25 1585 23 47 All 78.28 28.24 429 548 157 556
Richard Gasquet 2018 32 26 1535 25 56 All 78.97 27.01 492 623 168 622
Denis Shapovalov 2018 19 27 1440 27 60 All 82.17 19.40 659 802 154 794
Fernando Verdasco 2018 35 28 1410 28 61 All 79.09 24.81 609 770 193 778
Gael Monfils 2018 32 29 1400 24 49 All 80.65 23.07 525 651 149 646
Gilles Simon 2018 34 30 1370 28 56 All 77.48 28.40 523 675 190 669
Alex de Minaur 2018 19 31 1298 26 48 All 80.07 23.83 478 597 143 600
Steve Johnson 2018 29 33 1190 25 49 All 85.81 17.64 520 606 106 601
Philipp Kohlschreiber 2018 35 34 1125 23 46 All 82.09 19.60 486 592 119 607
Marton Fucsovics 2018 26 36 1122 27 47 All 76.98 26.83 438 569 154 574
Andreas Seppi 2018 34 37 1106 24 45 All 79.03 22.82 471 596 136 596
Frances Tiafoe 2018 20 39 1080 25 52 All 81.76 17.98 538 658 119 662
Jeremy Chardy 2018 31 40 1050 25 45 All 82.69 16.84 473 572 95 564
Adrian Mannarino 2018 30 42 1045 29 54 All 75.82 19.26 511 674 131 680
Nicolas Jarry 2018 23 43 1022 22 45 All 82.97 16.59 526 634 105 633
Joao Sousa 2018 29 44 1017 29 51 All 76.79 22.74 526 685 156 686
Damir Dzumhur 2018 26 47 985 31 54 All 70.27 26.22 442 629 167 637
Dusan Lajovic 2018 28 48 985 25 46 All 77.98 22.99 464 595 137 596
Robin Haase 2018 31 50 965 28 52 All 76.94 21.16 544 707 150 709
Benoit Paire 2018 29 52 935 29 54 All 73.40 23.95 505 688 165 689
Pierre-Hugues Herbert 2018 27 55 903 25 45 All 82.79 16.78 505 610 101 602
Leonardo Mayer 2018 31 56 895 25 47 All 82.06 18.01 485 591 105 583
Jan-Lennard Struff 2018 28 57 875 25 47 All 80.74 18.91 499 618 118 624
Guido Pella 2018 28 58 860 25 45 All 78.61 21.88 463 589 128 585
Peter Gojowczyk 2018 29 59 855 25 48 All 79.57 17.74 444 558 99 558
Albert Ramos-Vinolas 2018 30 65 790 29 46 All 75.44 17.05 424 562 97 569
Mischa Zverev 2018 31 69 750 31 48 All 76.79 20.18 430 560 115 570

Visualize

surfaceごとのkeep/break率の分布(Top50 players)

  • Clay, Grass, Hard(+All)コート毎における、Top50選手のkeep/break率の分布. 試合数が少ない場合、データのバラツキが多くなるので年間5試合以上でfilter.
  • Clayはbreak率が高く、Grassはkeep率が高い傾向. HardはAllと同じような分布.
  • 当たり前だが上位選手ほどkeep/break率共に高い.
plot_tmp <- function(data){
  data %>% 
  ggplot(aes(x=percentage_return, y=percentage_service, color=ranking, size=matches)) +
  geom_point() +
  scale_color_viridis() +
  xlim(0, 55) +
  ylim(55, 100) +
  xlab("break[%]") +
  ylab("keep[%]")
}

df %>% 
  dplyr::filter(ranking <= 50 & matches >= 5) %>% 
  split(.$surface) %>% 
  purrr::map(~ plot_tmp(.x))
## $All

## 
## $Clay

## 
## $Grass

## 
## $Hard

  • ggExtra::ggMarginal()でヒストグラムも一緒に
    (purrr::map()がうまく動かなかったのでsurfaceごとにコード書いた…)

  • All

df %>% 
  dplyr::filter(ranking <= 50 & matches >= 5 & surface == "All") %>% 
  ggplot(aes(x=percentage_return, y=percentage_service, color=ranking, size=matches)) +
  geom_point() +
  scale_color_viridis() +
  xlab("break[%]") +
  ylab("keep[%]") + 
  theme(legend.position = "bottom") -> p
  
  p %>% ggExtra::ggMarginal(type = "histogram", fill = "blue")

  • Clay
df %>% 
  dplyr::filter(ranking <= 50 & matches >= 5 & surface == "Clay") %>% 
  ggplot(aes(x=percentage_return, y=percentage_service, color=ranking, size=matches)) +
  geom_point() +
  scale_color_viridis() +
  xlab("break[%]") +
  ylab("keep[%]") + 
  theme(legend.position = "bottom") -> p
  
  p %>% ggExtra::ggMarginal(type = "histogram", fill = "blue")

  • Grass
df %>% 
  dplyr::filter(ranking <= 50 & matches >= 5 & surface == "Grass") %>% 
  ggplot(aes(x=percentage_return, y=percentage_service, color=ranking, size=matches)) +
  geom_point() +
  scale_color_viridis() +
  xlab("break[%]") +
  ylab("keep[%]") + 
  theme(legend.position = "bottom") -> p
  
  p %>% ggExtra::ggMarginal(type = "histogram", fill = "blue")

  • Hard
df %>% 
  dplyr::filter(ranking <= 50 & matches >= 5 & surface == "Hard") %>% 
  ggplot(aes(x=percentage_return, y=percentage_service, color=ranking, size=matches)) +
  geom_point() +
  scale_color_viridis() +
  xlab("break[%]") +
  ylab("keep[%]") + 
  theme(legend.position = "bottom") -> p
  
  p %>% ggExtra::ggMarginal(type = "histogram", fill = "blue")

  • keep[%]の分布
df %>% 
  dplyr::filter(ranking <= 50 & matches >= 5) %>% 
  ggplot(aes(x = percentage_service, y = as.factor(surface), fill = ..x..)) +
  geom_density_ridges_gradient(scale = 1, rel_min_height = 0.01) +
  scale_fill_viridis(name = "keep[%]", option = "C") +
  xlab("keep[%]") +
  ylab("surface")
## Picking joint bandwidth of 1.18

  • break[%]の分布
df %>% 
  dplyr::filter(ranking <= 50 & matches >= 5) %>% 
  ggplot(aes(x = percentage_return, y = as.factor(surface), fill = ..x..)) +
  geom_density_ridges_gradient(scale = 1, rel_min_height = 0.01) +
  scale_fill_viridis(name = "break[%]", option = "C") +
  xlab("break[%]") +
  ylab("surface")
## Picking joint bandwidth of 1.19

surfaceごとのkeep/break率の分布(Top10 players)

  • 傾向はTop50 playersの場合と同じ.
  • Clay, Grass, Hard(+All)コート毎における、Top50選手のkeep/break率の分布. 試合数が少ない場合、データのバラツキが多くなるので年間5試合以上でfilter.
  • Clayはbreak率が高く、Grassはkeep率が高い傾向. HardはAllと同じような分布.
  • 当たり前だが上位選手ほどkeep/break率共に高い.
plot_tmp <- function(data){
  data %>% 
  ggplot(aes(x=percentage_return, y=percentage_service, color=ranking, size=matches)) +
  geom_point() +
  scale_color_viridis() +
  xlim(0, 55) +
  ylim(55, 100) +
  xlab("break[%]") +
  ylab("keep[%]")
}

df %>% 
  dplyr::filter(ranking <= 10 & matches >= 5) %>% 
  split(.$surface) %>% 
  purrr::map(~ plot_tmp(.x))
## $All

## 
## $Clay

## 
## $Grass

## 
## $Hard

  • ggExtra::ggMarginal()でヒストグラムも一緒に
    (Top50と同様、purrr::map()がうまく動かなかったのでsurfaceごとにコード書いた…)

  • All

df %>% 
  dplyr::filter(ranking <= 10 & matches >= 5 & surface == "All") %>% 
  ggplot(aes(x=percentage_return, y=percentage_service, color=ranking, size=matches)) +
  geom_point() +
  scale_color_viridis() +
  xlab("break[%]") +
  ylab("keep[%]") + 
  theme(legend.position = "bottom") -> p
  
  p %>% ggExtra::ggMarginal(type = "histogram", fill = "blue")

  • Clay
df %>% 
  dplyr::filter(ranking <= 10 & matches >= 5 & surface == "Clay") %>% 
  ggplot(aes(x=percentage_return, y=percentage_service, color=ranking, size=matches)) +
  geom_point() +
  scale_color_viridis() +
  xlab("break[%]") +
  ylab("keep[%]") + 
  theme(legend.position = "bottom") -> p
  
  p %>% ggExtra::ggMarginal(type = "histogram", fill = "blue")

  • Grass
df %>% 
  dplyr::filter(ranking <= 10 & matches >= 5 & surface == "Grass") %>% 
  ggplot(aes(x=percentage_return, y=percentage_service, color=ranking, size=matches)) +
  geom_point() +
  scale_color_viridis() +
  xlab("break[%]") +
  ylab("keep[%]") + 
  theme(legend.position = "bottom") -> p
  
  p %>% ggExtra::ggMarginal(type = "histogram", fill = "blue")

  • Hard
df %>% 
  dplyr::filter(ranking <= 10 & matches >= 5 & surface == "Hard") %>% 
  ggplot(aes(x=percentage_return, y=percentage_service, color=ranking, size=matches)) +
  geom_point() +
  scale_color_viridis() +
  xlab("break[%]") +
  ylab("keep[%]") + 
  theme(legend.position = "bottom") -> p
  
  p %>% ggExtra::ggMarginal(type = "histogram", fill = "blue")

surfaceごとのkeep/break率の分布(BIG4 & Top10)

  • BIG4(Roger Federer, Rafael Nadal, Novak Djokovic, Andy Murray)についてkeep/break率を見てみる.
player_list <- c("Roger Federer", "Rafael Nadal", "Novak Djokovic", "Andy Murray" , "others(Top10)")
color_list <- c("Roger Federer" = "#ffbb00", "Rafael Nadal" = "#fb6542", "Novak Djokovic" = "#375e97", "Andy Murray" = "#3f671c", "others(Top10)" = "gray80")

plot_tmp <- function(data){
  data %>% 
  dplyr::mutate(player =  ifelse(player %in% player_list, player, "others(Top10)")) %>% 
  transform(player = factor(player, levels = player_list)) %>% 
  ggplot(aes(x=percentage_return, y=percentage_service, color=player, size=matches)) +
  geom_point(alpha = 0.6) +
  scale_color_manual(values = color_list) +
  xlab("break[%]") +
  ylab("keep[%]")
}

df %>% 
  dplyr::filter(ranking <= 10 & matches >= 5) %>% 
  split(.$surface) %>% 
  purrr::map(~ plot_tmp(.x))
## $All

## 
## $Clay

## 
## $Grass

## 
## $Hard

surfaceごとのkeep/break率の分布(錦織圭)

  • 錦織選手についてもkeep/break率を見てみる.
  • 2014~2016年はkeep/break率ともに高い.
  • どちらかといえば右下(keep率は低いがbreak率が高い傾向)にいると思っていたがそうでもなかった.
plot_tmp <- function(data){
  data %>% 
  ggplot(aes(x=percentage_return, y=percentage_service, color=year, size=matches)) +
  geom_point() +
  scale_color_viridis() +
  gghighlight(player == "Kei Nishikori") +
  xlab("break[%]") +
  ylab("keep[%]")
}

df %>% 
  dplyr::filter(ranking <= 50 & matches >= 5) %>% 
  split(.$surface) %>% 
  purrr::map(~ plot_tmp(.x))
## $All

## 
## $Clay

## 
## $Grass

## 
## $Hard

keep/break率の分布推移(Animation)

  • surface == Allにおけるkeep/break率分布の推移をanimationで確認
  • なんとなく左上(サーブ優位傾向)に移動しているような気がする…
df %>%
  dplyr::filter(ranking <= 50 & matches >= 5 & surface == "All") %>%
  ggplot(aes(x=percentage_return, y=percentage_service, color=ranking, size=matches)) +
  geom_point() +
  scale_color_viridis() +
  labs(title = 'year: {frame_time}', x = 'break[%]', y = 'keep[%]') +
  transition_time(as.integer(year)) +
  ease_aes('linear')

keep/break率の分布推移

  • 微妙な変化だが、徐々にサービス優位になってる様子

  • keep[%]

df %>% 
  dplyr::filter(ranking <= 50 & matches >= 5 & surface == "All") %>% 
  ggplot(aes(x = percentage_service, y = as.factor(year), fill = ..x..)) +
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.05) +
  scale_fill_viridis(name = "keep[%]", option = "C") +
  xlab("keep[%]") +
  ylab("year") +
  labs(title = 'keep percentage of Top50 players')
## Picking joint bandwidth of 1.88

  • break[%]
df %>% 
  dplyr::filter(ranking <= 50 & matches >= 5 & surface == "All") %>% 
  ggplot(aes(x = percentage_return, y = as.factor(year), fill = ..x..)) +
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.05) +
  scale_fill_viridis(name = "break[%]", option = "C") +
  xlab("break[%]") +
  ylab("year") +
  labs(title = 'break percentage of Top50 players')
## Picking joint bandwidth of 1.75

年齢分布の推移

  • Top50、Top10ともに年齢が年々増加している. 特にTop10は顕著(というか顔ぶれ変わってない)
  • 2018年はTop50における20代前半の選手の割合が増加.
  • 2017年にTop10選手の顔ぶれが若返ったかに思えたが2018年は元に戻った模様.

  • Top50 players

df %>% 
  dplyr::filter(ranking <= 50 & matches >= 5 & surface == "All") %>% 
  ggplot(aes(x = age, y = as.factor(year), fill = ..x..)) +
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.05) +
  scale_fill_viridis(name = "age", option = "C") +
  xlab("age") +
  ylab("year") +
  labs(title = 'Age of Top50 players')
## Picking joint bandwidth of 1.3

  • Top10 players
df %>% 
  dplyr::filter(ranking <= 10 & matches >= 5 & surface == "All") %>% 
  ggplot(aes(x = age, y = as.factor(year), fill = ..x..)) +
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.05) +
  scale_fill_viridis(name = "age", option = "C") +
  xlab("age") +
  ylab("year") +
  labs(title = 'Age of Top10 players')
## Picking joint bandwidth of 1.34